0.1 Code chunk 1 - Set up, data import and inspection code for the following

1.A. Remember to use getwd() and setwd() to set the working directory in your rmarkdown file. For example, run mydir <- getwd() and then setwd(mydir).

library(rmarkdown)
library(psych)
library(scatterplot3d)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: lattice
# Import data using read.csv().  Do not coerce the character variables to factors automatically when loading the data.  Examine the overall ‘structure’ of the input data.

# Record the current working directory and set it explicitly; setting it to
# the value just returned by getwd() leaves the directory unchanged.
mydir <- getwd()
setwd(dir = mydir)
# Read the CSV from the working directory, keeping text columns as character
# (no automatic conversion to factors).
balanced <- read.csv(
  "CD_additional_balanced.csv",
  stringsAsFactors = FALSE
)
# Compact per-column overview of the freshly loaded data
str(object = balanced)
## 'data.frame':    9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : chr  "blue-collar" "entrepreneur" "technician" "technician" ...
##  $ marital       : chr  "divorced" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "university.degree" "basic.9y" "professional.course" ...
##  $ default       : chr  "unknown" "unknown" "no" "unknown" ...
##  $ housing       : chr  "yes" "yes" "no" "yes" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : chr  "yes" "yes" "yes" "yes" ...

1.B. Transform all of the character variables that include categorical values to factor variables. After this transformation, show the overall ‘structure’ and the ‘summary’ of the input data.

# Check whether any column is already a factor before converting.
# The test is done per column: coercing the whole data frame with
# as.character() yields a character vector, for which is.factor() is always
# FALSE regardless of the column types.
any(vapply(balanced, is.factor, logical(1)))
## [1] FALSE
# Changing character variables to factor
# The character columns (job, marital, education, default, housing, loan,
# contact, month, day_of_week, poutcome, y) are all categorical, so they are
# located by type and converted in one pass instead of one assignment each.
char_cols <- vapply(balanced, is.character, logical(1))
balanced[char_cols] <- lapply(balanced[char_cols], factor)

# Structure of the data after the factor conversion (the summary follows
# further below)
str(object = balanced)
## 'data.frame':    9280 obs. of  21 variables:
##  $ age           : int  41 49 49 41 45 42 39 28 44 42 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
##  $ duration      : int  1575 1042 1467 579 461 673 935 1201 1030 1623 ...
##  $ campaign      : int  1 1 1 1 1 2 3 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Per-column summary of the data after the factor conversion
summary(object = balanced)
##       age                job           marital                   education   
##  Min.   :17.0   admin.     :2517   divorced:1021   university.degree  :3007  
##  1st Qu.:31.0   blue-collar:1769   married :5338   high.school        :2102  
##  Median :38.0   technician :1459   single  :2900   professional.course:1190  
##  Mean   :40.4   services   : 773   unknown :  21   basic.9y           :1177  
##  3rd Qu.:48.0   management : 651                   basic.4y           : 895  
##  Max.   :98.0   retired    : 595                   basic.6y           : 458  
##                 (Other)    :1516                   (Other)            : 451  
##     default        housing          loan           contact         month     
##  no     :7824   no     :4104   no     :7688   cellular :6672   may    :2533  
##  unknown:1456   unknown: 225   unknown: 225   telephone:2608   jul    :1477  
##                 yes    :4951   yes    :1367                    aug    :1353  
##                                                                jun    :1169  
##                                                                nov    : 886  
##                                                                apr    : 785  
##                                                                (Other):1077  
##  day_of_week    duration         campaign          pdays          previous     
##  fri:1763    Min.   :   1.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  mon:1846    1st Qu.: 145.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  thu:2000    Median : 265.0   Median : 2.000   Median :999.0   Median :0.0000  
##  tue:1810    Mean   : 387.4   Mean   : 2.333   Mean   :887.3   Mean   :0.3153  
##  wed:1861    3rd Qu.: 528.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##              Max.   :4199.0   Max.   :39.000   Max.   :999.0   Max.   :6.0000  
##                                                                                
##         poutcome     emp.var.rate     cons.price.idx  cons.conf.idx   
##  failure    :1074   Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  nonexistent:7244   1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70  
##  success    : 962   Median :-0.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.4963   Mean   :93.48   Mean   :-40.22  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.634   Min.   :4964   no :4640  
##  1st Qu.:1.244   1st Qu.:5076   yes:4640  
##  Median :4.021   Median :5191             
##  Mean   :2.960   Mean   :5135             
##  3rd Qu.:4.959   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
## 

0.2 Code chunk 2 - For each of these numeric variables - age, duration, campaign, and pdays

2.A. Create a histogram and include a title of the histogram.

# Histograms of age, duration, campaign and pdays, drawn in that order.
# Each entry maps a column name to the title shown above its histogram;
# the column name itself is used as the x-axis label.
hist_titles <- c(
  age = "Histogram of Age in the CD Additional Balanced data set",
  duration = "Histogram of Duration in the CD Additional Balanced data set",
  campaign = "Histogram of Campaign in the CD Additional Balanced data set",
  pdays = "Histogram of Pdays in the CD Additional Balanced data set"
)
for (var_name in names(hist_titles)) {
  hist(balanced[[var_name]], main = hist_titles[[var_name]], xlab = var_name)
}

2.B. Create a boxplot and include a title in the plot.

# Boxplots of age, duration, campaign and pdays, drawn in that order.
# Each entry maps a column name to the title shown above its boxplot;
# the column name itself is used as the y-axis label.
box_titles <- c(
  age = "Boxplot of Age in the CD Additional Balanced data set",
  duration = "Boxplot of Duration in the CD Additional Balanced data set",
  campaign = "Boxplot of Campaign in the CD Additional Balanced data set",
  pdays = "Boxplot of Pdays in the CD Additional Balanced data set"
)
for (var_name in names(box_titles)) {
  boxplot(balanced[[var_name]], main = box_titles[[var_name]], ylab = var_name)
}

2.C. Show deciles of the variable.

# Deciles of age: the 0%, 10%, ..., 100% quantiles
quantile(balanced$age, probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##   17   27   30   33   35   38   41   46   51   57   98
# Deciles of duration: the 0%, 10%, ..., 100% quantiles
quantile(balanced$duration, probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1   80  124  167  211  265  340  452  615  860 4199
# Deciles of campaign: the 0%, 10%, ..., 100% quantiles
quantile(balanced$campaign, probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    1    1    1    1    1    2    2    2    3    4   39
# Deciles of pdays: the 0%, 10%, ..., 100% quantiles
quantile(balanced$pdays, probs = seq(0, 1, by = 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0   11  999  999  999  999  999  999  999  999  999

0.3 Code chunk 3 – Explore factor variables

3.A. For each of the selected factor variables, and for each of the variable’s levels (e.g., “success”, “failure”, “nonexistent” of poutcome), show the count value and percentage value of instances belonging to that level.

Note: Select variable y and three other factor variables (e.g., job, education and poutcome) for this task. Do not include additional variables.

# Number of observations at each level of y
y.table <- table(balanced[["y"]])
print(y.table)
## 
##   no  yes 
## 4640 4640
# Share of observations at each level of y, as a percentage
y.perc <- 100 * prop.table(table(balanced[["y"]]))
print(y.perc)
## 
##  no yes 
##  50  50
# Number of observations at each level of job
job.table <- table(balanced[["job"]])
print(job.table)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          2517          1769           308           216           651 
##       retired self-employed      services       student    technician 
##           595           306           773           358          1459 
##    unemployed       unknown 
##           248            80
# Share of observations at each level of job, as a percentage
job.perc <- 100 * prop.table(table(balanced[["job"]]))
print(job.perc)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##     27.122845     19.062500      3.318966      2.327586      7.015086 
##       retired self-employed      services       student    technician 
##      6.411638      3.297414      8.329741      3.857759     15.721983 
##    unemployed       unknown 
##      2.672414      0.862069
# Number of observations at each level of education
education.table <- table(balanced[["education"]])
print(education.table)
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##                 895                 458                1177                2102 
##          illiterate professional.course   university.degree             unknown 
##                   6                1190                3007                 445
# Share of observations at each level of education, as a percentage
education.perc <- 100 * prop.table(table(balanced[["education"]]))
print(education.perc)
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##          9.64439655          4.93534483         12.68318966         22.65086207 
##          illiterate professional.course   university.degree             unknown 
##          0.06465517         12.82327586         32.40301724          4.79525862
# Number of observations at each level of poutcome
poutcome.table <- table(balanced[["poutcome"]])
print(poutcome.table)
## 
##     failure nonexistent     success 
##        1074        7244         962
# Share of observations at each level of poutcome, as a percentage
poutcome.perc <- 100 * prop.table(table(balanced[["poutcome"]]))
print(poutcome.perc)
## 
##     failure nonexistent     success 
##    11.57328    78.06034    10.36638

3.B. For each of the selected variables, show a bar plot of the number of instances (i.e. count) with a level name for each possible value. Show a descriptive title in each plot.

# Bar plots of the level counts for y, job, education and poutcome, drawn in
# that order. Each entry pairs a count table with its plot title; the entry
# name is used as the x-axis label.
bar_specs <- list(
  y = list(
    counts = y.table,
    title = "Bar Plot of y in the CD Additional Balanced data set"
  ),
  job = list(
    counts = job.table,
    title = "Bar Plot of job in the CD Additional Balanced data set"
  ),
  education = list(
    counts = education.table,
    title = "Bar Plot of education in the CD Additional Balanced data set"
  ),
  poutcome = list(
    counts = poutcome.table,
    title = "Bar Plot of poutcome in the CD Additional Balanced data set"
  )
)
for (var_name in names(bar_specs)) {
  spec <- bar_specs[[var_name]]
  barplot(spec[["counts"]], main = spec[["title"]], xlab = var_name)
}

0.4 Code chunk 4 – Explore relationships amongst multiple variables

4.A Use cor and pairs.panels to display correlations for these seven numeric variables – age, duration, campaign, pdays, euribor3m, emp.var.rate, and nr.employed.

# Numeric columns whose pairwise correlations are examined
cor_vars <- c(
  "age", "duration", "campaign", "pdays",
  "euribor3m", "emp.var.rate", "nr.employed"
)
cor_display <- balanced[cor_vars]

# Correlation matrix of the selected columns
cor(x = cor_display)
##                       age    duration     campaign       pdays   euribor3m
## age           1.000000000 -0.02072651  0.003690016 -0.05351616 -0.04462745
## duration     -0.020726510  1.00000000 -0.025872465  0.02893622  0.05733951
## campaign      0.003690016 -0.02587247  1.000000000  0.08930062  0.17512283
## pdays        -0.053516156  0.02893622  0.089300624  1.00000000  0.38773934
## euribor3m    -0.044627449  0.05733951  0.175122827  0.38773934  1.00000000
## emp.var.rate -0.049052629  0.07144035  0.185736186  0.33488799  0.95840218
## nr.employed  -0.074686516  0.05823209  0.176972215  0.47499217  0.94054583
##              emp.var.rate nr.employed
## age           -0.04905263 -0.07468652
## duration       0.07144035  0.05823209
## campaign       0.18573619  0.17697221
## pdays          0.33488799  0.47499217
## euribor3m      0.95840218  0.94054583
## emp.var.rate   1.00000000  0.86752989
## nr.employed    0.86752989  1.00000000
# Scatter-plot matrix of the selected columns
psych::pairs.panels(x = cor_display)

4.B For each of these numeric variables - duration, emp.var.rate, cons.price.idx, and cons.conf.idx.

  1. Show a boxplot of this numeric variable by y.
  2. Use the aggregate function with ‘summary’ to aggregate this variable by y. The output should be the six-number summary statistics (i.e., min., 1st quartile, median, mean, 3rd quartile, and max.) of the variable (e.g., duration), aggregated separately for the “yes” and “no” levels of y.

The output of aggregate in this task will NOT be visible until you knit. You can test your code by copying and pasting it into the console.

# One boxplot per numeric variable, split by the two levels of y

# duration by y
boxplot(formula = duration ~ y, data = balanced)

# emp.var.rate by y
boxplot(formula = emp.var.rate ~ y, data = balanced)

# cons.price.idx by y
boxplot(formula = cons.price.idx ~ y, data = balanced)

# cons.conf.idx by y
boxplot(formula = cons.conf.idx ~ y, data = balanced)

# summary() statistics of duration within each level of y
aggregate(duration ~ y, data = balanced, FUN = summary)
##     y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1  no        1.0000          94.0000        166.0000      221.5323
## 2 yes       37.0000         253.0000        449.0000      553.1912
##   duration.3rd Qu. duration.Max.
## 1         279.2500     1994.0000
## 2         741.2500     4199.0000
# summary() statistics of emp.var.rate within each level of y
aggregate(emp.var.rate ~ y, data = balanced, FUN = summary)
##     y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1  no        -3.4000000           -1.8000000           1.1000000
## 2 yes        -3.4000000           -1.8000000          -1.8000000
##   emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1         0.2409052            1.4000000         1.4000000
## 2        -1.2334483           -0.1000000         1.4000000
# summary() statistics of cons.price.idx within each level of y
aggregate(cons.price.idx ~ y, data = balanced, FUN = summary)
##     y cons.price.idx.Min. cons.price.idx.1st Qu. cons.price.idx.Median
## 1  no            92.20100               93.07500              93.91800
## 2 yes            92.20100               92.89300              93.20000
##   cons.price.idx.Mean cons.price.idx.3rd Qu. cons.price.idx.Max.
## 1            93.60397               93.99400            94.76700
## 2            93.35439               93.91800            94.76700
# summary() statistics of cons.conf.idx within each level of y
aggregate(cons.conf.idx ~ y, data = balanced, FUN = summary)
##     y cons.conf.idx.Min. cons.conf.idx.1st Qu. cons.conf.idx.Median
## 1  no          -50.80000             -42.70000            -41.80000
## 2 yes          -50.80000             -46.20000            -40.40000
##   cons.conf.idx.Mean cons.conf.idx.3rd Qu. cons.conf.idx.Max.
## 1          -40.64647             -36.40000          -26.90000
## 2          -39.78978             -36.10000          -26.90000

4.C Draw a 3d scatter plot to show y values in shapes (e.g. circle for “no”, triangle for “yes”) for each of the following combinations of numeric variables (along the three axes). Include a main title for the plot and a legend for the shapes of y in the plot. (i) age, campaign and duration; (ii) nr.employed, euribor3m and duration.

# Plotting symbol per observation: the integer code of its y level
# (1 for the first level, 2 for the second), matched by the legends below.
y_shapes <- as.numeric(balanced$y)
y_levels <- levels(balanced$y)

# 3D scatter plot (i): age, campaign and duration
scatterplot3d(
  x = balanced$age,
  y = balanced$campaign,
  z = balanced$duration,
  pch = y_shapes,
  main = "3D scatter plot of age, campaign and duration in the CD Additional Balanced data"
)

legend("topright", legend = y_levels, pch = 1:2, cex = 0.8)

# 3D scatter plot (ii): nr.employed, euribor3m and duration
scatterplot3d(
  x = balanced$nr.employed,
  y = balanced$euribor3m,
  z = balanced$duration,
  pch = y_shapes,
  main = "3D scatter plot of nr.employed, euribor3m and duration in the CD Additional Balanced data"
)

legend("topright", legend = y_levels, pch = 1:2, cex = 0.8)